Desarrollo#

Punto 2.#

C10.png

Punto 3.#

Docker:

C7.png

pgAdmin:

C8.png

Punto 4.#

I) Crear la tabla nombrada: employees, explicar qué tarea realiza la consulta realizada y mostrar en pantalla la tabla

import psycopg2
from psycopg2 import Error

# Create the HR-style "employees" table and show its (empty) contents.
# The original script imported psycopg2.Error but never used it and never
# closed the cursor/connection on failure; handle both here.
connection = None
try:
    # Connect to the PostgreSQL instance exposed by the Docker container.
    connection = psycopg2.connect(user="undatascience-user",
                                  password="undatascience-password",
                                  host="localhost",
                                  port="5432",
                                  database="undatascience-db")
    cursor = connection.cursor()

    # DDL: employee_id is the primary key.
    create_table_query = '''CREATE TABLE employees (
    employee_id INTEGER,
    first_name VARCHAR(20),
    last_name VARCHAR(25),
    email VARCHAR(25),
    phone_number VARCHAR(20),
    hire_date DATE,
    job_id VARCHAR(10),
    salary NUMERIC(8,2),
    commission_pct NUMERIC(2,2),
    manager_id INTEGER,
    department_id INTEGER,
    PRIMARY KEY (employee_id)
);'''

    cursor.execute(create_table_query)
    print("Table 'employees' created successfully")

    # Display the table contents (empty immediately after creation).
    cursor.execute("SELECT * FROM employees")
    rows = cursor.fetchall()

    print("\nTable 'employees':")
    for row in rows:
        print(row)

    connection.commit()
except Error as e:
    # Surface database failures instead of crashing with a raw traceback.
    print("Error while working with PostgreSQL:", e)
finally:
    if connection:
        cursor.close()
        connection.close()
Table 'employees' created successfully

Table 'employees':

SELECT * FROM employees LIMIT 10; Esta consulta selecciona todas las columnas de la tabla employees y muestra los primeros 10 registros.

SELECT count(1) FROM employees; Esta consulta cuenta el número total de registros en la tabla employees.

C1.png

II) Crear la tabla courses

import psycopg2

# Open a session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Course catalogue DDL; course_id auto-increments via SERIAL.
courses_ddl = '''CREATE TABLE IF NOT EXISTS courses (
    course_id SERIAL PRIMARY KEY,
    course_name VARCHAR(60),
    course_author VARCHAR(40),
    course_status VARCHAR(20),
    course_published_dt DATE
);'''

cursor.execute(courses_ddl)
print("Table 'courses' created successfully")

connection.commit()
connection.close()
Table 'courses' created successfully

III) Insertar datos

import psycopg2
from datetime import date

# Sample catalogue rows; draft courses have no publication date (None).
course_rows = [
    ("Programming using Python", "Bob Dillon", "published", date(2020, 9, 30)),
    ("Data Engineering using Python", "Bob Dillon", "published", date(2020, 7, 15)),
    ("Data Engineering using Scala", "Elvis Presley", "draft", None),
    ("Programming using Scala", "Elvis Presley", "published", date(2020, 5, 12)),
    ("Programming using Java", "Mike Jack", "inactive", date(2020, 8, 10)),
    ("Web Applications - Python Flask", "Bob Dillon", "inactive", date(2020, 7, 20)),
    ("Web Applications - Java Spring", "Mike Jack", "draft", None),
    ("Pipeline Orchestration - Python", "Bob Dillon", "draft", None),
    ("Streaming Pipelines- Python", "Bob Dillon", "published", date(2020, 10, 5)),
    ("Web Applications - Scala Play", "Elvis Presley", "inactive", date(2020, 9, 30)),
    ("Web Applications - Python Django", "Bob Dillon", "published", date(2020, 6, 23)),
    ("Server Automation - Ansible", "Uncle Sam", "published", date(2020, 7, 5))
]

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Parameterised insert; executemany runs it once per tuple above.
courses_insert = '''INSERT INTO courses (course_name, course_author, course_status, course_published_dt)
                  VALUES (%s, %s, %s, %s);'''

cursor.executemany(courses_insert, course_rows)
print("Data inserted into 'courses' table successfully")

connection.commit()
connection.close()
Data inserted into 'courses' table successfully

C2.png

IV) Borre todos los cursos que no estén en modo borrador ni publicados. Proporcione la sentencia de borrado como respuesta para este ejercicio en el Jupyter Book. Para validar, obtenga el recuento de todos los cursos publicados por autor y asegúrese de que la salida está ordenada en forma descendente por recuento.

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Remove every course whose status is neither 'draft' nor 'published'.
delete_query = '''DELETE FROM courses
                  WHERE course_status NOT IN ('draft', 'published');'''
cursor.execute(delete_query)
print("Courses not in draft or published status deleted successfully")

# Validation: published courses per author, highest count first.
count_query = '''SELECT course_author, COUNT(*) AS course_count
                 FROM courses
                 WHERE course_status = 'published'
                 GROUP BY course_author
                 ORDER BY course_count DESC;'''
cursor.execute(count_query)

# Mostrar los resultados
print("\nCount of published courses by author (descending order):")
for author_row in cursor.fetchall():
    print(author_row)

connection.commit()
connection.close()
Courses not in draft or published status deleted successfully

Count of published courses by author (descending order):
('Bob Dillon', 4)
('Elvis Presley', 1)
('Uncle Sam', 1)

C3.png

VI) Crear la tabla users

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Users table DDL; user_id auto-increments via SERIAL.
users_ddl = '''CREATE TABLE users(
    user_id SERIAL PRIMARY KEY,
    user_first_name VARCHAR(30),
    user_last_name VARCHAR(30),
    user_email_id VARCHAR(50),
    user_gender VARCHAR(1),
    user_unique_id VARCHAR(15),
    user_phone_no VARCHAR(20),
    user_dob DATE,
    created_ts TIMESTAMP
);'''

cursor.execute(users_ddl)
print("Table 'users' created successfully")

connection.commit()
connection.close()
Table 'users' created successfully

C4.png

import psycopg2

# Connect to the Dockerised PostgreSQL instance.
connection = psycopg2.connect(user="undatascience-user",
                              password="undatascience-password",
                              host="localhost",
                              port="5432",
                              database="undatascience-db")
cursor = connection.cursor()

# Bulk insert of 25 sample users as one multi-row VALUES statement.
# Notes on the data:
#  - 'O''Shiels' uses SQL's doubled single-quote to escape an apostrophe;
#  - some rows carry NULL for user_unique_id, user_gender or user_phone_no,
#    which later exercises the NULL-handling queries (Punto 4, X and XII).
insert_query = '''
INSERT INTO users (
    user_first_name, user_last_name, user_email_id, user_gender, 
    user_unique_id, user_phone_no, user_dob, created_ts
) VALUES
    ('Giuseppe', 'Bode', 'gbode0@imgur.com', 'M', '88833-8759', 
     '+86 (764) 443-1967', '1973-05-31', '2018-04-15 12:13:38'),
    ('Lexy', 'Gisbey', 'lgisbey1@mail.ru', 'F', '262501-029', 
     '+86 (751) 160-3742', '2003-05-31', '2020-12-29 06:44:09'),
    ('Karel', 'Claringbold', 'kclaringbold2@yale.edu', 'F', '391-33-2823', 
     '+62 (445) 471-2682', '1985-11-28', '2018-11-19 00:04:08'),
    ('Marv', 'Tanswill', 'mtanswill3@dedecms.com', 'F', '1195413-80', 
     '+62 (497) 736-6802', '1998-05-24', '2018-11-19 16:29:43'),
    ('Gertie', 'Espinoza', 'gespinoza4@nationalgeographic.com', 'M', '471-24-6869', 
     '+249 (687) 506-2960', '1997-10-30', '2020-01-25 21:31:10'),
    ('Saleem', 'Danneil', 'sdanneil5@guardian.co.uk', 'F', '192374-933', 
     '+63 (810) 321-0331', '1992-03-08', '2020-11-07 19:01:14'),
    ('Rickert', 'O''Shiels', 'roshiels6@wikispaces.com', 'M', '749-27-47-52', 
     '+86 (184) 759-3933', '1972-11-01', '2018-03-20 10:53:24'),
    ('Cybil', 'Lissimore', 'clissimore7@pinterest.com', 'M', '461-75-4198', 
     '+54 (613) 939-6976', '1978-03-03', '2019-12-09 14:08:30'),
    ('Melita', 'Rimington', 'mrimington8@mozilla.org', 'F', '892-36-676-2', 
     '+48 (322) 829-8638', '1995-12-15', '2018-04-03 04:21:33'),
    ('Benetta', 'Nana', 'bnana9@google.com', 'M', '197-54-1646', 
     '+420 (934) 611-0020', '1971-12-07', '2018-10-17 21:02:51'),
    ('Gregorius', 'Gullane', 'ggullanea@prnewswire.com', 'F', '232-55-52-58', 
     '+62 (780) 859-1578', '1973-09-18', '2020-01-14 23:38:53'),
    ('Una', 'Glayzer', 'uglayzerb@pinterest.com', 'M', '898-84-336-6', 
     '+380 (840) 437-3981', '1983-05-26', '2019-09-17 03:24:21'),
    ('Jamie', 'Vosper', 'jvosperc@umich.edu', 'M', '247-95-68-44', 
     '+81 (205) 723-1942', '1972-03-18', '2020-07-23 16:39:33'),
    ('Calley', 'Tilson', 'ctilsond@issuu.com', 'F', '415-48-894-3', 
     '+229 (698) 777-4904', '1987-06-12', '2020-06-05 12:10:50'),
    ('Peadar', 'Gregorowicz', 'pgregorowicze@omniture.com', 'M', '403-39-5-869', 
     '+7 (267) 853-3262', '1996-09-21', '2018-05-29 23:51:31'),
    ('Jeanie', 'Webling', 'jweblingf@booking.com', 'F', '399-83-05-03', 
     '+351 (684) 413-0550', '1994-12-27', '2018-02-09 01:31:11'),
    ('Yankee', 'Jelf', 'yjelfg@wufoo.com', 'F', '607-99-0411', 
     '+1 (864) 112-7432', '1988-11-13', '2019-09-16 16:09:12'),
    ('Blair', 'Aumerle', 'baumerleh@toplist.cz', 'F', '430-01-578-5', 
     '+7 (393) 232-1860', '1979-11-09', '2018-10-28 19:25:35'),
    ('Pavlov', 'Steljes', 'psteljesi@macromedia.com', 'F', '571-09-6181', 
     '+598 (877) 881-3236', '1991-06-24', '2020-09-18 05:34:31'),
    ('Darn', 'Hadeke', 'dhadekej@last.fm', 'M', '478-32-02-87', 
     '+370 (347) 110-4270', '1984-09-04', '2018-02-10 12:56:00'),
    ('Wendell', 'Spanton', 'wspantonk@de.vu', 'F', NULL, 
     '+84 (301) 762-1316', '1973-07-24', '2018-01-30 01:20:11'),
    ('Carlo', 'Yearby', 'cyearbyl@comcast.net', 'F', NULL, 
     '+55 (288) 623-4067', '1974-11-11', '2018-06-24 03:18:40'),
    ('Sheila', 'Evitts', 'sevittsm@webmd.com', NULL, '830-40-5287',
     NULL, '1977-03-01', '2020-07-20 09:59:41'),
    ('Sianna', 'Lowdham', 'slowdhamn@stanford.edu', NULL, '778-0845', 
     NULL, '1985-12-23', '2018-06-29 02:42:49'),
    ('Phylys', 'Aslie', 'paslieo@qq.com', 'M', '368-44-4478', 
     '+86 (765) 152-8654', '1984-03-22', '2019-10-01 01:34:28');
'''

cursor.execute(insert_query)
print("Valores insertados correctamente en la tabla 'users'")

connection.commit()
connection.close()
Valores insertados correctamente en la tabla 'users'

C5.png

VII) Obtenga el número de usuarios creados por año. Utilice la tabla de usuarios para este ejercicio.

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Count users grouped by the year of their creation timestamp.
query = '''
SELECT 
    DATE_PART('year', created_ts) AS created_year,
    COUNT(*) AS user_count
FROM 
    users
GROUP BY 
    created_year
ORDER BY 
    created_year ASC;
'''

cursor.execute(query)
results = cursor.fetchall()

# Each result row is a (year, count) pair; unpack it directly.
print("Año de Creación\tUsuario Count")
for created_year, user_count in results:
    print(f"{created_year}\t\t{user_count}")

connection.close()
Año de Creación	Usuario Count
2018.0		13
2019.0		4
2020.0		8

VIII) Obtenga los días de nacimiento de todos los usuarios nacidos en el mes de mayo.

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Users born in May (month 5), ordered by day of the month; TO_CHAR
# renders the weekday name of each birth date.
query = '''
SELECT
    user_id,
    user_dob,
    user_email_id,
    TO_CHAR(user_dob, 'Day') AS user_day_of_birth
FROM
    users
WHERE
    EXTRACT(MONTH FROM user_dob) = 5
ORDER BY
    EXTRACT(DAY FROM user_dob);
'''

cursor.execute(query)
results = cursor.fetchall()

print("User ID\tDOB\t\t\tUser Email\t\tUser Day of Birth")
print("-" * 100)

# Fixed-width columns: pad every field with ljust before concatenating.
for uid, dob, email, day_name in results:
    print(str(uid).ljust(8) + str(dob).ljust(20) + email.ljust(30) + day_name.ljust(15))

connection.close()
User ID	DOB			User Email		User Day of Birth
----------------------------------------------------------------------------------------------------
4       1998-05-24          mtanswill3@dedecms.com        Sunday         
12      1983-05-26          uglayzerb@pinterest.com       Thursday       
1       1973-05-31          gbode0@imgur.com              Thursday       
2       2003-05-31          lgisbey1@mail.ru              Saturday       

IX) Obtenga los nombres e ids de correo electrónico de los usuarios añadidos en el año 2019.

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Users created during 2019, with the full name upper-cased and the
# result ordered alphabetically by that name.
query = '''
SELECT
    user_id,
    CONCAT(UPPER(user_first_name), ' ', UPPER(user_last_name)) AS user_name,
    user_email_id,
    created_ts,
    EXTRACT(YEAR FROM created_ts) AS created_year
FROM
    users
WHERE
    EXTRACT(YEAR FROM created_ts) = 2019
ORDER BY
    user_name;
'''

cursor.execute(query)
results = cursor.fetchall()

print("User ID\tUser Name\t\t\tUser Email\t\tCreated Timestamp\tCreated Year")
print("-" * 100)

# Fixed-width columns: pad every field with ljust before concatenating.
for uid, full_name, email, created_at, created_year in results:
    line = (str(uid).ljust(8) + full_name.ljust(25) + email.ljust(30)
            + str(created_at).ljust(25) + str(created_year).ljust(15))
    print(line)

connection.close()
User ID	User Name			User Email		Created Timestamp	Created Year
----------------------------------------------------------------------------------------------------
8       CYBIL LISSIMORE          clissimore7@pinterest.com     2019-12-09 14:08:30      2019           
25      PHYLYS ASLIE             paslieo@qq.com                2019-10-01 01:34:28      2019           
12      UNA GLAYZER              uglayzerb@pinterest.com       2019-09-17 03:24:21      2019           
17      YANKEE JELF              yjelfg@wufoo.com              2019-09-16 16:09:12      2019           

X) Obtenga el número de usuarios por género. Utilice la tabla de users para este ejercicio.

import psycopg2

# Session against the Dockerised PostgreSQL instance.
pg_settings = {
    "user": "undatascience-user",
    "password": "undatascience-password",
    "host": "localhost",
    "port": "5432",
    "database": "undatascience-db",
}
connection = psycopg2.connect(**pg_settings)
cursor = connection.cursor()

# Count users per gender, mapping the single-letter codes to labels and
# anything else (including NULL) to 'Not Specified'.
query = '''
SELECT
    CASE
        WHEN user_gender = 'M' THEN 'Male'
        WHEN user_gender = 'F' THEN 'Female'
        ELSE 'Not Specified'
    END AS gender,
    COUNT(*) AS user_count
FROM
    users
GROUP BY
    user_gender
ORDER BY
    user_count DESC;
'''

cursor.execute(query)
results = cursor.fetchall()

print("Gender\t\tUser Count")
print("-" * 100)  

# Fixed-width columns: pad both fields with ljust before concatenating.
for gender_label, n_users in results:
    print(gender_label.ljust(15) + str(n_users).ljust(10))

connection.close()
Gender		User Count
----------------------------------------------------------------------------------------------------
Female         13        
Male           10        
Not Specified  2         

XII) Obtenga los 4 últimos dígitos de los ids únicos.

import psycopg2

# Connect to the Dockerised PostgreSQL instance.
connection = psycopg2.connect(user="undatascience-user",
                              password="undatascience-password",
                              host="localhost",
                              port="5432",
                              database="undatascience-db")
cursor = connection.cursor()

# Last 4 digits of each user's unique id:
#  - NULL ids are reported as 'Not Specified' (the CASE inspects the raw
#    column, independently of the COALESCE alias above it);
#  - ids with fewer than 9 characters once dashes are stripped are flagged
#    as 'Invalid Unique Id';
#  - otherwise the last 4 characters of the dash-free id are returned.
query = '''
SELECT
    user_id,
    COALESCE(user_unique_id, 'Not Specified') AS user_unique_id,
    CASE
        WHEN user_unique_id IS NULL THEN 'Not Specified'
        WHEN LENGTH(REPLACE(user_unique_id, '-', '')) < 9 THEN 'Invalid Unique Id'
        ELSE RIGHT(REPLACE(user_unique_id, '-', ''), 4)
    END AS user_unique_id_last4
FROM
    users
ORDER BY
    user_id;
'''

cursor.execute(query)

results = cursor.fetchall()

print("User ID\tUser Unique ID\t\t\tUser Unique ID Last 4")
print("-" * 100) 

# Fixed-width columns: pad each field with ljust before printing.
for row in results:
    user_id = str(row[0]).ljust(8)
    user_unique_id = row[1].ljust(25)
    user_unique_id_last4 = row[2].ljust(20)
    print(f"{user_id}{user_unique_id}{user_unique_id_last4}")

connection.close()
User ID	User Unique ID			User Unique ID Last 4
----------------------------------------------------------------------------------------------------
1       88833-8759               8759                
2       262501-029               1029                
3       391-33-2823              2823                
4       1195413-80               1380                
5       471-24-6869              6869                
6       192374-933               4933                
7       749-27-47-52             4752                
8       461-75-4198              4198                
9       892-36-676-2             6762                
10      197-54-1646              1646                
11      232-55-52-58             5258                
12      898-84-336-6             3366                
13      247-95-68-44             6844                
14      415-48-894-3             8943                
15      403-39-5-869             5869                
16      399-83-05-03             0503                
17      607-99-0411              0411                
18      430-01-578-5             5785                
19      571-09-6181              6181                
20      478-32-02-87             0287                
21      Not Specified            Not Specified       
22      Not Specified            Not Specified       
23      830-40-5287              5287                
24      778-0845                 Invalid Unique Id   
25      368-44-4478              4478                

XIII) Obtenga el recuento de usuarios en función del código de país.

import psycopg2

# Connect to the Dockerised PostgreSQL instance.
connection = psycopg2.connect(user="undatascience-user",
                              password="undatascience-password",
                              host="localhost",
                              port="5432",
                              database="undatascience-db")
cursor = connection.cursor()

# Count users per international dialling code extracted from the phone
# number prefix (e.g. '+86 (...)' -> 86).
# NOTE: this must be a raw string — the original non-raw literal made
# Python parse \+ and \d as (invalid) escape sequences, which raises
# SyntaxWarning on Python 3.12+. The regexes are evaluated by PostgreSQL.
query = r'''
SELECT
    CAST(SUBSTRING(user_phone_no FROM '\+(\d{1,3})') AS INT) AS country_code,
    COUNT(*) AS user_count
FROM
    users
WHERE
    user_phone_no IS NOT NULL
    AND user_phone_no <> ''
    AND user_phone_no ~* '^\+\d{1,3}'
GROUP BY
    country_code
ORDER BY
    country_code;
'''

cursor.execute(query)

results = cursor.fetchall()

print("Country Code\tUser Count")
print("-" * 100)
# Fixed-width columns: pad both fields with ljust before printing.
for row in results:
    country_code = str(row[0]).ljust(15)
    user_count = str(row[1]).ljust(10)
    print(f"{country_code}{user_count}")

connection.close()
Country Code	User Count
----------------------------------------------------------------------------------------------------
1              1         
7              2         
48             1         
54             1         
55             1         
62             3         
63             1         
81             1         
84             1         
86             4         
229            1         
249            1         
351            1         
370            1         
380            1         
420            1         
598            1         

Punto 5.#

import sqlite3
import pandas as pd
from sqlite3 import Error

def create_connection(db_file):
    """Open (and immediately close) a SQLite connection to *db_file*.

    Used only to verify that the database file can be created/opened;
    prints the SQLite library version on success, the error on failure.
    Always returns None.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # sqlite3.version (the DB-API module version) is deprecated since
        # Python 3.12 and removed in 3.14; report the actual SQLite
        # library version instead.
        print(sqlite3.sqlite_version)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

if __name__ == '__main__':
    create_connection(r"sqlitedb.db")
2.6.0
# Load the Cardano (ADA-USD) daily OHLCV history from the course repository.
df_ADA = pd.read_csv("https://raw.githubusercontent.com/lihkir/Uninorte/main/AppliedStatisticMS/DataVisualizationRPython/Lectures/Python/PythonDataSets/ADA-USD.csv")
df_ADA.head()
Date Open High Low Close Adj Close Volume
0 2017-10-01 0.021678 0.032226 0.017354 0.024969 0.024969 50068700.0
1 2017-10-02 0.024607 0.030088 0.019969 0.025932 0.025932 57641300.0
2 2017-10-03 0.025757 0.027425 0.020690 0.020816 0.020816 16997800.0
3 2017-10-04 0.020864 0.022806 0.020864 0.021931 0.021931 9000050.0
4 2017-10-05 0.021951 0.022154 0.020859 0.021489 0.021489 5562510.0
# Normalise the column names to snake_case so they are convenient SQL
# identifiers for the SQLite table created below.
column_map = {'Date': 'date',
              'Open': 'open',
              'High': 'high',
              'Low': 'low',
              'Close': 'close',
              'Adj Close': 'adj_close',
              'Volume': 'volume'}
df_ADA = df_ADA.rename(column_map, axis=1)
df_ADA.head()
date open high low close adj_close volume
0 2017-10-01 0.021678 0.032226 0.017354 0.024969 0.024969 50068700.0
1 2017-10-02 0.024607 0.030088 0.019969 0.025932 0.025932 57641300.0
2 2017-10-03 0.025757 0.027425 0.020690 0.020816 0.020816 16997800.0
3 2017-10-04 0.020864 0.022806 0.020864 0.021931 0.021931 9000050.0
4 2017-10-05 0.021951 0.022154 0.020859 0.021489 0.021489 5562510.0
# Keep the column order so the data read back from SQLite can be re-labelled.
names_list = df_ADA.columns.tolist()
names_list
['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']
import sqlalchemy
from sqlalchemy import create_engine

# Persist the DataFrame into the local SQLite file as table 'ADA';
# 'replace' drops any previous copy so the cell is re-runnable.
engine = create_engine('sqlite:///sqlitedb.db')
df_ADA.to_sql('ADA', engine, if_exists = 'replace', index=False, method='multi')
engine.dispose()
# Read the 'ADA' table back from SQLite into a DataFrame and recover the
# column names. One query is enough: fetchall() yields the rows and
# cursor.description the column metadata, so the original second
# "SELECT * FROM ADA" (run only to list columns) is unnecessary.
connection = sqlite3.connect('sqlitedb.db')

cursor = connection.cursor()
cursor.execute("SELECT * FROM ADA;")
close = cursor.fetchall()
ADA_sql = pd.DataFrame(close)
display(ADA_sql.head())

# description is a sequence of 7-tuples whose first element is the name.
ADA_names = [column[0] for column in cursor.description]

display(ADA_names)
print("\nOperation done successfully")
connection.close()
0 1 2 3 4 5 6
0 2017-10-01 0.021678 0.032226 0.017354 0.024969 0.024969 50068700.0
1 2017-10-02 0.024607 0.030088 0.019969 0.025932 0.025932 57641300.0
2 2017-10-03 0.025757 0.027425 0.020690 0.020816 0.020816 16997800.0
3 2017-10-04 0.020864 0.022806 0.020864 0.021931 0.021931 9000050.0
4 2017-10-05 0.021951 0.022154 0.020859 0.021489 0.021489 5562510.0
['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']
Operation done successfully
# Restore the original column names and order the rows chronologically.
ADA_sql.columns = names_list 
ADA_sql = ADA_sql.sort_values(by = "date")
ADA_sql.head()
date open high low close adj_close volume
0 2017-10-01 0.021678 0.032226 0.017354 0.024969 0.024969 50068700.0
1 2017-10-02 0.024607 0.030088 0.019969 0.025932 0.025932 57641300.0
2 2017-10-03 0.025757 0.027425 0.020690 0.020816 0.020816 16997800.0
3 2017-10-04 0.020864 0.022806 0.020864 0.021931 0.021931 9000050.0
4 2017-10-05 0.021951 0.022154 0.020859 0.021489 0.021489 5562510.0

Candlestick Chart#

import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Candlestick chart of the daily ADA-USD OHLC series.
fig = go.Figure(data=[go.Candlestick(x=ADA_sql['date'],
                                     open=ADA_sql['open'],
                                     high=ADA_sql['high'],
                                     low=ADA_sql['low'],
                                     close=ADA_sql['close'])])

fig.update_layout(title='Cardano USD (ADA-USD) Candlestick Chart',
                  xaxis_title='Day',
                  yaxis_title='ADA-USD')

# Save a standalone HTML copy of the figure next to the notebook.
pio.write_html(fig, file='candlestick_chart.html', auto_open=False)

fig.show()

# Re-embed the saved HTML file in the notebook output.
from IPython.display import HTML
HTML(filename='candlestick_chart.html')

EDA#

Se describe un DataFrame con 7 columnas y 1475 filas. Las columnas contienen datos de diferentes tipos, principalmente float64 (números decimales) y object.

# Column dtypes and non-null counts (the numeric columns have 4 missing rows).
ADA_sql.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1475 entries, 0 to 1474
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   date       1475 non-null   object 
 1   open       1471 non-null   float64
 2   high       1471 non-null   float64
 3   low        1471 non-null   float64
 4   close      1471 non-null   float64
 5   adj_close  1471 non-null   float64
 6   volume     1471 non-null   float64
dtypes: float64(6), object(1)
memory usage: 80.8+ KB

Los precios (open, high, low, close, adj_close) muestran una variabilidad considerable, como se evidencia por las desviaciones estándar relativamente altas en comparación con las medias. El rango de los precios (min-max) indica que hubo momentos de alta volatilidad en el mercado de ADA. Los valores máximos (max) para open, high, low, close, y adj_close son significativamente más altos que los valores al 75%, lo que sugiere la presencia de outliers o momentos de precios excepcionalmente altos. Los volúmenes también muestran una gran variabilidad, con un rango amplio y una desviación estándar alta en comparación con la media. En general, parece haber periodos de alta actividad en el mercado, como lo indican los valores máximos de volume.

# Summary statistics (count, mean, std, quartiles) for the numeric columns.
ADA_sql.describe()
open high low close adj_close volume
count 1471.000000 1471.000000 1471.000000 1471.000000 1471.000000 1.471000e+03
mean 0.374234 0.393359 0.354385 0.375726 0.375726 1.090947e+09
std 0.609602 0.639195 0.578952 0.611501 0.611501 2.258370e+09
min 0.018414 0.021050 0.017354 0.018539 0.018539 1.739460e+06
25% 0.047560 0.049503 0.046073 0.047592 0.047592 5.366896e+07
50% 0.093398 0.096776 0.090105 0.093434 0.093434 1.359691e+08
75% 0.301339 0.318211 0.283567 0.301889 0.301889 6.999972e+08
max 2.966390 3.099186 2.907606 2.968239 2.968239 1.914198e+10

Validamos la cantidad de registros únicos en cada columna del DataFrame

# Number of distinct values per column.
ADA_sql.nunique()
date         1475
open         1468
high         1465
low          1463
close        1467
adj_close    1467
volume       1471
dtype: int64

Se verifica que el DataFrame contiene valores nulos, en las columnas open, high, low, close, adj_close, volume.

# Count of missing values per column.
ADA_sql.isnull().sum()
date         0
open         4
high         4
low          4
close        4
adj_close    4
volume       4
dtype: int64
import sweetviz as sv
# Automated EDA summary report rendered inline in the notebook.
svdf =sv.analyze(ADA_sql)
svdf.show_notebook()
c:\Users\Andres\psqlpy_venv\Lib\site-packages\tqdm\auto.py:21: TqdmWarning:

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)

Podemos ver que hay un aumento de precios para ADA en 2021, lo que sugiere que pudo haber sido una buena inversión durante ese periodo; no obstante, una sola tendencia alcista no basta para concluir que sea una buena inversión en general.

import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
# (the duplicate "import matplotlib.pyplot as plt" from the original cell
# was removed — the module was imported twice)

def gplot(ax, data, price, label, ylabel, color):
    """Plot one price column of *data* against its index on axis *ax*.

    ax:     matplotlib Axes to draw on
    data:   DataFrame indexed by date
    price:  column name to plot
    label:  legend label; ylabel: y-axis label; color: line colour
    """
    ax.plot(data.index, data[price], label=label, color=color)
    ax.legend()
    ax.set_xlabel("Time")
    ax.set_ylabel(ylabel)

f = plt.figure(figsize=(15, 12))

columns_to_plot = ["open", "high", "low", "close", "adj_close", "volume"]

# One subplot per OHLCV column, laid out on a 3x2 grid.
for i, col in enumerate(columns_to_plot, start=1):
    ax = f.add_subplot(3, 2, i)
    gplot(ax, ADA_sql, col, f"Cardano {col.capitalize()} Performance", f"ADA {col.capitalize()} Price", "skyblue")

plt.tight_layout()

plt.show()
_images/5d194b38e73daaa462c23c48acf589babe8d10f0a7da2cccb50bcd92c21922dc.png
# Index the frame by date so it can be resampled.
ADA_sql['date'] = pd.to_datetime(ADA_sql['date'])
ADA_sql.set_index('date', inplace=True)

def mini_data(data):
    """Return only the OHLC price columns of *data*."""
    df = data[["close", "low", "high", "open"]]
    return df

ada1_mini = mini_data(ADA_sql)

from scipy import stats

# Month-end means; "ME" replaces the "M" alias, which is deprecated in
# pandas 2.2 (the original cell emitted a FutureWarning saying so).
ada_month = ada1_mini.resample("ME").mean()
ada_month["close_box"], lmbda = stats.boxcox(ada_month.close)
C:\Users\Andres\AppData\Local\Temp\ipykernel_1536\3795962630.py:2: FutureWarning:

'M' is deprecated and will be removed in a future version, please use 'ME' instead.

Descomposición Estacionaria: Se separa la serie temporal en sus componentes principales: tendencia, estacionalidad y residuo (o componente irregular). La idea es descomponer la serie en estas partes para comprender mejor su comportamiento y patrones subyacentes.

Serie Original: Es la serie temporal original sin ninguna transformación.

Tendencia: Muestra la dirección general de los datos a largo plazo, ignorando las fluctuaciones estacionales y aleatorias. Los datos tienen una tendencia creciente a lo largo del tiempo.

Estacionalidad: Representa los patrones que se repiten a intervalos regulares. Se pueden identificar ciclos o patrones que se repiten en ciertos períodos de tiempo.

Residuo (Componente Irregular): Es lo que queda después de que se hayan eliminado la tendencia y la estacionalidad. Representa la variabilidad que no se puede explicar por la tendencia ni por la estacionalidad. Un residuo ideal debería parecerse a ruido blanco, es decir, no debe tener patrones reconocibles.

plt.rcParams["figure.figsize"] = (20, 15)

def season_df(data, label):
    """Resample *data* to month-end means, plot the seasonal decomposition
    of its close price, and print *label*."""
    # "ME" replaces the "M" alias, deprecated in pandas 2.2 (the original
    # cell emitted a FutureWarning saying so).
    df = data.resample("ME").mean()
    seasonal_decompose(df.close).plot()
    print(label)
    return plt.show()

season_df(data=ada1_mini, label="ADA Seasonal Decomposition")
C:\Users\Andres\AppData\Local\Temp\ipykernel_1536\701762445.py:4: FutureWarning:

'M' is deprecated and will be removed in a future version, please use 'ME' instead.
ADA Seasonal Decomposition
_images/aa07db5321ebf5b90b5dd687ac1b6d9102bca54c2725964574441596d4655f4a.png

Transformación de Box-Cox: esta transformación es útil cuando los datos muestran una varianza que cambia con el nivel de los datos. Al transformar los datos, se busca hacer que la varianza sea más constante. También se utiliza para hacer que los datos se aproximen más a una distribución normal.

# Recompute the monthly means and Box-Cox transform the close price;
# "ME" replaces the "M" alias, deprecated in pandas 2.2 (the original
# cell emitted a FutureWarning saying so).
ada_month = ada1_mini.resample("ME").mean()
ada_month["close_box"], lmbda = stats.boxcox(ada_month.close)
C:\Users\Andres\AppData\Local\Temp\ipykernel_1536\2132941290.py:2: FutureWarning:

'M' is deprecated and will be removed in a future version, please use 'ME' instead.
from itertools import product
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings

# Candidate (p, q) orders for an ARIMA(p, 1, q) on the Box-Cox series.
ps = range(0, 3)
qs = range(0, 3)
d = 1
parameters = product(ps, qs)
parameters_list = list(parameters)
len(parameters_list)

# Model Selection: fit every candidate and keep the lowest-AIC model.
results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
for pq in parameters_list:
    try:
        fitted = SARIMAX(ada_month.close_box, order=(pq[0], d, pq[1])).fit(disp=-1)
    except ValueError:
        print('bad parameter combination:', pq)
        continue
    current_aic = fitted.aic
    if current_aic < best_aic:
        best_model = fitted
        best_aic = current_aic
        best_param = pq
    results.append([pq, fitted.aic])

result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
print(result_table.sort_values(by='aic', ascending=True).head())
print(best_model.summary())
  parameters         aic
1     (0, 1)  141.744687
4     (1, 1)  142.280304
2     (0, 2)  142.411059
6     (2, 0)  143.480131
3     (1, 0)  143.572433
                               SARIMAX Results                                
==============================================================================
Dep. Variable:              close_box   No. Observations:                   49
Model:               SARIMAX(0, 1, 1)   Log Likelihood                 -68.872
Date:                Sun, 07 Apr 2024   AIC                            141.745
Time:                        16:47:18   BIC                            145.487
Sample:                    10-31-2017   HQIC                           143.159
                         - 10-31-2021                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1          0.4606      0.151      3.053      0.002       0.165       0.756
sigma2         1.0272      0.119      8.661      0.000       0.795       1.260
===================================================================================
Ljung-Box (L1) (Q):                   0.28   Jarque-Bera (JB):               166.77
Prob(Q):                              0.60   Prob(JB):                         0.00
Heteroskedasticity (H):               0.10   Skew:                             1.88
Prob(H) (two-sided):                  0.00   Kurtosis:                        11.32
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
# Residual diagnostics of the selected model: standardised residuals,
# histogram/KDE, Q-Q plot and correlogram.
best_model.plot_diagnostics(figsize=(15, 12))
plt.show()
_images/b7b6c99409ab551e9a7752e985fccf084ef64251ff67353346a412d7bb2400f2.png